library(data.table)
library(magrittr)
library(ggplot2); theme_set(theme_bw(base_size = 16))
library(patchwork)
data_dir <- "~/Documents/Projects/2020-09_CZI_JamesJingli/Eric_scRNAseq/data/"
We currently have the following samples:
smptab <- fread(paste0(data_dir, "samplesEric.txt"))
smptab$Sample <- paste(smptab$tissue, smptab$surgery, smptab$timepoint, smptab$run, sep = "_")
ABCutilities::my_table(smptab)
| sample | tissue | timepoint | surgery | run | Sample |
|---|---|---|---|---|---|
| 7DAYS_ISLETS_SHAM | islets | d07 | sham | run1 | islets_sham_d07_run1 |
| 7DAYS_ISLETS_LAD | islets | d07 | MI | run1 | islets_MI_d07_run1 |
| 30DAYS_BLOOD_SHAM | blood | d30 | sham | run1 | blood_sham_d30_run1 |
| 30DAYS_BLOOD_LAD | blood | d30 | MI | run1 | blood_MI_d30_run1 |
| 30DAYS_ISLETS_SHAM | islets | d30 | sham | run1 | islets_sham_d30_run1 |
| 30DAYS_ISLETS_LAD | islets | d30 | MI | run1 | islets_MI_d30_run1 |
| 1DAY_BLOOD_SHAM | blood | d01 | sham | run1 | blood_sham_d01_run1 |
| 1DAY_BLOOD_LAD | blood | d01 | MI | run1 | blood_MI_d01_run1 |
| 1DAY_ISLETS_SHAM | islets | d01 | sham | run1 | islets_sham_d01_run1 |
| 1DAY_ISLETS_LAD | islets | d01 | MI | run1 | islets_MI_d01_run1 |
| 1day_heart_MI | heart | d01 | MI | run2 | heart_MI_d01_run2 |
| 1day_heart_sham | heart | d01 | sham | run2 | heart_sham_d01_run2 |
| 1month_liver_MI | liver | d30 | MI | run2 | liver_MI_d30_run2 |
| 1month_liver_sham | liver | d30 | sham | run2 | liver_sham_d30_run2 |
| 1week_blood_MI | blood | d07 | MI | run2 | blood_MI_d07_run2 |
| 1week_blood_sham | blood | d07 | sham | run2 | blood_sham_d07_run2 |
| 1week_heart_MI | heart | d07 | MI | run2 | heart_MI_d07_run2 |
| 1week_heart_sham | heart | d07 | sham | run2 | heart_sham_d07_run2 |
| 1week_liver_sham | liver | d07 | sham | run3 | liver_sham_d07_run3 |
| 1week_liver_MI | liver | d07 | MI | run3 | liver_MI_d07_run3 |
| 1day_liver_sham | liver | d01 | sham | run3 | liver_sham_d01_run3 |
| 1day_liver_MI | liver | d01 | MI | run3 | liver_MI_d01_run3 |
| 1mo_heart_sham | heart | d30 | sham | run3 | heart_sham_d30_run3 |
| 1mo_heart_MI | heart | d30 | MI | run3 | heart_MI_d30_run3 |
For QC, I will:
paste0("sce_unfiltered_", Sys.Date(), ".rds"))Then I’ll return to the server for filtering, batch correction, clustering, dimReds, and SingleR annotation. I will combine samples of the same tissue.
# on server:
#$ conda activate czi
options(menu.graphics=FALSE)
library(SingleCellExperiment)
library(DropletUtils)
library(scater)
library(scran)
library(magrittr)
data_dir <- "/scratchLocal/frd2007/2021-04_EricJames_CZI/data/"
smptab <- read.table(paste0(data_dir, "samplesEric.txt"), header=TRUE, stringsAsFactors = FALSE)
smptab$my_sample_name <- paste(smptab$tissue, smptab$surgery, smptab$timepoint, smptab$run, sep = "_")
smpls <- unique(smptab$sample)
scel <- lapply(smpls, function(x){
mysmp <- subset(smptab, sample == x)$my_sample_name
print(paste("Reading CellRanger output for", x))
out.sce <- DropletUtils::read10xCounts(
samples = paste0(data_dir, "CellRangerOutput/", x,
"/outs/filtered_feature_bc_matrix"),
sample.names = mysmp,
version = "auto")
})
full_data <- do.call(cbind, lapply(scel, counts))
cell_info <- do.call(rbind, lapply(scel, colData))
gene_info <- rowData(scel[[1]])
## combine in one object
sce.all <- SingleCellExperiment(
list(counts = full_data),
rowData = gene_info,
colData = cell_info,
metadata = list(Samples = names(scel))
)
rm(scel); gc()
## remove completely uncovered genes
gnszero <- Matrix::rowSums(counts(sce.all)) == 0
sce.all <- sce.all[!gnszero, ]
#> dim(sce.all)
#[1] 27859 442307
## add cellnames
cellnames <- lapply(unique(sce.all$Sample), function(x){
tmp <- sce.all[, sce.all$Sample==x]
n_smp <- ncol(tmp)
outnames <- paste(x, c(1:n_smp),sep=".")
return(outnames)
}) %>% unlist
colnames(sce.all) <- cellnames
## add QC info
is.mito <- grepl("mt-", ignore.case = TRUE, rowData(sce.all)$Symbol)
sce.all <- addPerCellQC(sce.all, subsets=list(mitochondrial=is.mito))
mm <- lapply(unique(sce.all$Sample), function(x){
ss <- sce.all[, sce.all$Sample == x]$subsets_mitochondrial_percent
isOutlier(ss, type = "higher")})
sce.all$mito.discard <- unlist(mm)
sce.all$mito.discard <- ifelse(is.na(sce.all$mito.discard), FALSE, sce.all$mito.discard)
## determine GENE count thresholds for each Sample individually
gg <- lapply(unique(sce.all$Sample), function(x){
ss <- log10(sce.all[, sce.all$Sample == x]$detected)
isOutlier(ss)
})
sce.all$gene.discard <- unlist(gg)
## save colData
library(data.table)
cd <-colData(sce.all)
cd$cell <- rownames(cd)
cd <- as.data.frame(cd) %>% as.data.table
saveRDS(cd, file = paste0("colData_unfiltered_", Sys.Date(), ".rds"))
saveRDS(sce.all, file = paste0("sce_unfiltered_", Sys.Date(), ".rds"))
## load the colData locally for Rmd
qcall <- readRDS(paste0(data_dir, "colData_unfiltered_2021-06-15.rds"))
qcall$tissue <- gsub("_.*","", qcall$Sample)
qcall$surgery <- gsub(".*_([MIsham]+)_.*", "\\1", qcall$Sample)
qcall$day <- gsub(".*_(d[0-9]+)_.*", "\\1", qcall$Sample)
qcall$run <- gsub(".*_","",qcall$Sample)
for(i in unique(qcall$tissue)){
dt <- qcall[tissue == i]
P1 <- ggplot(dt,
aes(x = Sample, y = subsets_mitochondrial_percent, color = mito.discard)) +
ggbeeswarm::geom_quasirandom(size = .2, alpha = .3, shape = 1) +
theme(axis.text.x = element_text(angle=90, vjust=0.5, size=14)) +
scale_color_manual(values = rev(c("firebrick2","grey75"))) +
ylab("% mitochondrial genes") + coord_cartesian(ylim = c(0,20)) +
theme(legend.position = "bottom")+
guides(colour = guide_legend(override.aes = list(alpha = 1, size = 3, shape = 19)))
P2 <- ggplot(dt,
aes(x = subsets_mitochondrial_percent, y = log10(detected),
color = mito.discard)) +
geom_point(size = .2, shape = 1 , alpha = .5) +
facet_wrap(~Sample, ncol = 3) +
scale_color_manual(values = rev(c("firebrick2","grey75"))) +
xlab("% mitochondrial genes") + ylab("log10(total number of genes)")+
theme(legend.position = "bottom")+
guides(colour = guide_legend(override.aes = list(alpha = 1, size = 3, shape = 19)))
pw <- P1 | P2
pw <- pw + plot_annotation(title = i) + plot_layout(widths = c(1.5,3))
print(pw)
}
qcall[,
max_mito := ifelse(tissue == "islets", 10,
ifelse(tissue == "blood" & Sample %in% c("blood_MI_d30_run1","blood_sham_d30_run1", "blood_sham_d07_run2", "blood_MI_d07_run2"), 7.5,
ifelse(tissue == "blood" & Sample != "blood_MI_d07_run2", 5,
ifelse(tissue == "blood", 2,
ifelse(tissue == "heart" & surgery == "MI", 10,
ifelse(tissue == "heart", 15,
ifelse(tissue == "liver", 50, NA)))))))]
for(i in unique(qcall$tissue)){
dt <- qcall[tissue == i & subsets_mitochondrial_percent <= max_mito]
ymax <- max(dt$max_mito)
P1 <- ggplot(dt,
aes(x = Sample, y = subsets_mitochondrial_percent, color = gene.discard)) +
ggbeeswarm::geom_quasirandom(size = .2, alpha = .3, shape = 1) +
theme(axis.text.x = element_text(angle=90, vjust=0.5, size=14)) +
scale_color_manual(values = rev(c("blue","grey75"))) +
ylab("% mitochondrial genes") + coord_cartesian(ylim = c(0,ymax)) +
theme(legend.position = "bottom") +
guides(colour = guide_legend(override.aes = list(alpha = 1, size = 3, shape = 19)))
P2 <- ggplot(dt,
aes(x = subsets_mitochondrial_percent, y = log10(detected),
color = gene.discard)) +
geom_point(size = .2, shape = 1 , alpha = .5) +
facet_wrap(~Sample, ncol = 3) +
scale_color_manual(values = rev(c("blue","grey75"))) +
xlab("% mitochondrial genes") + ylab("log10(total number of genes)")+
theme(legend.position = "bottom") +
guides(colour = guide_legend(override.aes = list(alpha = 1, size = 3, shape = 19)))
pw <- P1 | P2
pw <- pw + plot_annotation(title = i) + plot_layout(widths = c(1.5,3))
print(pw)
}